In [1]:
import graphlab as gl
gl.canvas.set_target("ipynb")
In [2]:
# Download and parse
ratings = gl.SFrame.read_csv('ml-1m/ratings.dat', delimiter='::', header=False)
items = gl.SFrame.read_csv('ml-1m/movies.dat', delimiter='::', header=False)
# Rename columns
ratings = ratings.rename({'X1': 'user_id', 'X2': 'item_id', 'X3': 'score', 'X4': 'timestamp'})
items = items.rename({'X1': 'item_id', 'X2': 'title_year', 'X3': 'genres'})
In [3]:
ratings
Out[3]:
In [4]:
items
Out[4]:
In [5]:
items.show()
In [6]:
# Split "Title (Year)" into separate title and year columns
items['title'] = items['title_year'].apply(lambda x: x[:-7])
# Re-encode titles from Latin-1 to UTF-8
items['title'] = items['title'].apply(lambda x: x.decode('iso8859').encode('utf-8'))
items['year'] = items['title_year'].apply(lambda x: x[-5:-1])
# Turn the pipe-delimited genre string into a list
items['genres'] = items['genres'].apply(lambda x: x.split('|'))
del items['title_year']
In [7]:
items
Out[7]:
How many unique users do we have?
In [8]:
ratings['user_id'].unique().size()
Out[8]:
In [9]:
items.show()
In [10]:
# Explicit feedback: keep the user, item, and numeric score columns
explicit = ratings[['user_id', 'item_id', 'score']]
explicit
Out[10]:
In [12]:
# Implicit feedback: treat ratings of 4 or higher as a positive signal
implicit = explicit[explicit['score'] >= 4.0][['user_id', 'item_id']]
implicit
Out[12]:
In [15]:
m = gl.recommender.create(implicit, 'user_id', 'item_id')
Because no target column was given, the call above trained an item_similarity
model. It computed Jaccard similarities between the items in this dataset, then ranked the top 100 most similar items for each item and stored them so they can be used at prediction time. For more information on how this model works, see the API reference.
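As a rough illustration, the Jaccard similarity between two items is the number of users who interacted with both divided by the number who interacted with either. A minimal sketch in plain Python, with made-up toy user sets (not the model's actual implementation):
# Toy example: Jaccard similarity between two items' user sets
users_a = set([1, 2, 3, 4])   # users who watched item A
users_b = set([3, 4, 5])      # users who watched item B
jaccard = float(len(users_a & users_b)) / len(users_a | users_b)
print(jaccard)  # 0.4 -> 2 shared users out of 5 distinct users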
In [16]:
m
Out[16]:
In [43]:
items[items['item_id'] == 1287]
Out[43]:
In [44]:
m.get_similar_items([1287], k=5)
Out[44]:
In [45]:
m.get_similar_items([1287]).join(items, on={'similar': 'item_id'}).sort('rank')
Out[45]:
In [46]:
m2 = gl.recommender.create(explicit, 'user_id', 'item_id', target='score')
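Unlike m, m2 was trained with target='score', so it learns to predict the numeric rating a user would give an item rather than ranking items by co-occurrence alone. A minimal sketch of using it, assuming the standard predict() and recommend() methods on GraphLab recommender models (the exact model class chosen by create() may vary):
# Predict scores for a few observed (user, item) pairs
m2.predict(explicit.head(5))
# Recommendations for user 4, now ranked using the learned score model
m2.recommend(users=[4], k=5).join(items, on='item_id').sort('rank')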
In [47]:
# Generate recommendations for every user seen during training
recs = m.recommend()
In [48]:
recs
Out[48]:
In [49]:
ratings[ratings['user_id'] == 4].join(items, on='item_id')
Out[49]:
In [50]:
m.recommend(users=[4], k=20).join(items, on='item_id').sort('rank')
Out[50]:
In [51]:
m.recommend?
In [53]:
recent_data = gl.SFrame()
recent_data['item_id'] = [1291] # Indiana Jones and the Last Crusade
recent_data['user_id'] = 99999
In [54]:
m.recommend(users=[99999], new_observation_data=recent_data).join(items, on='item_id').sort('rank')
Out[54]:
In [55]:
m.save('my_model')
In [56]:
m_again = gl.load_model('my_model')
In [57]:
m_again
Out[57]:
In [58]:
items.save('items')
ratings.save('ratings')
explicit.save('explicit')
implicit.save('implicit')
In [ ]: